# Importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Suppress warnings to keep the notebook output clean
import warnings
warnings.filterwarnings('ignore')

# Plotly for interactive visualizations
import plotly.offline as pyo
import plotly.graph_objs as go

# Set notebook mode to work offline
pyo.init_notebook_mode()
train=pd.read_csv("train.csv")
test=pd.read_csv("test.csv")
train.head()
test.head()
train.describe()
test.describe()
train.columns
test.columns
# UID is a unique identifier in both the train and test datasets, so it can serve as the index
train.set_index(keys=['UID'], inplace=True)
test.set_index(keys=['UID'], inplace=True)
# Handling missing values: percentage of missing values per column
train.isnull().sum()/len(train)*100
# BLOCKID and SUMLEVEL carry no useful information (almost entirely missing/constant), so drop them
train = train.drop(['BLOCKID','SUMLEVEL'], axis=1)
test.isnull().sum()/len(test)*100
test = test.drop(['BLOCKID','SUMLEVEL'], axis=1)
# Collect the columns that still contain missing values
missing_train_cols = []
for col in train.columns:
    if train[col].isna().sum() != 0:
        missing_train_cols.append(col)
print(missing_train_cols)
missing_test_cols = []
for col in test.columns:
    if test[col].isna().sum() != 0:
        missing_test_cols.append(col)
print(missing_test_cols)
# The columns with missing values are all numerical, so impute them with the column mean
# (for heavily skewed columns the median would be a more robust choice)
for col in missing_train_cols:
    train[col] = train[col].fillna(train[col].mean())
for col in missing_test_cols:
    test[col] = test[col].fillna(test[col].mean())
train.isna().sum().sum()
test.isna().sum().sum()
# Keep locations where percent home ownership is above 10%
df = train[train['pct_own'] > 0.1]
df.shape
# Sort by second-mortgage share, highest first, and take the top 2,500 locations
df = df.sort_values(by='second_mortgage', ascending=False)
pd.set_option('display.max_columns', None)
df.head()
top_2500_second_mortgage_pctown_10 = df.head(2500)
top_2500_second_mortgage_pctown_10
import plotly.graph_objects as go
# Visualization 1 (Geo-Map):
fig = go.Figure(data=go.Scattergeo(
    lat=top_2500_second_mortgage_pctown_10['lat'],
    lon=top_2500_second_mortgage_pctown_10['lng'],
))
fig.update_layout(
    geo=dict(
        scope='north america',
        showland=True,
        landcolor="rgb(212, 212, 212)",
        subunitcolor="rgb(255, 255, 255)",
        countrycolor="rgb(255, 255, 255)",
        showlakes=True,
        lakecolor="rgb(255, 255, 255)",
        showsubunits=True,
        showcountries=True,
        resolution=50,
        projection=dict(
            type='conic conformal',
            rotation_lon=-100
        ),
        lonaxis=dict(
            showgrid=True,
            gridwidth=0.5,
            range=[-140.0, -55.0],
            dtick=5
        ),
        lataxis=dict(
            showgrid=True,
            gridwidth=0.5,
            range=[20.0, 60.0],
            dtick=5
        )
    ),
    title='Top 2,500 locations where second mortgage share is highest and home ownership is above 10 percent'
)
fig.show()
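# Bad debt = second mortgage + home equity loans, minus the overlap counted in both
# (inclusion-exclusion, so households with both are not double-counted)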
train['bad_debt']=train['second_mortgage']+train['home_equity']-train['home_equity_second_mortgage']
# Visualization 2:
train['bins_bad_debt'] = pd.cut(train['bad_debt'],bins=[0,0.1,.5,1], labels=["less than 10%","10-50%","50-100%"])
train.groupby(['bins_bad_debt']).size().plot(kind='pie',subplots=True,startangle=90, autopct='%1.1f%%')
plt.title('Bad Debt pct')
plt.ylabel("")
plt.show()
# Visualization 3:
train['bins_debt'] = pd.cut(train['debt'],bins=[0,0.1,.5,1], labels=["less than 10%","10-50%","50-100%"])
train.groupby(['bins_debt']).size().plot(kind='pie',subplots=True,startangle=90, autopct='%1.1f%%')
plt.title('Debt pct')
plt.ylabel("")
plt.show()
cols=['second_mortgage','home_equity','debt','bad_debt']
df_box_hamilton=train.loc[train['city'] == 'Hamilton']
df_box_manhattan=train.loc[train['city'] == 'Manhattan']
df_box_city=pd.concat([df_box_hamilton,df_box_manhattan])
df_box_city.head(4)
# Visualization 4:
plt.figure(figsize=(10,5))
sns.boxplot(data=df_box_city,x='second_mortgage', y='city',width=0.5,palette="Set3")
plt.show()
# Visualization 5:
plt.figure(figsize=(10,5))
sns.boxplot(data=df_box_city,x='home_equity', y='city',width=0.5,palette="Set3")
plt.show()
# Visualization 6:
plt.figure(figsize=(10,5))
sns.boxplot(data=df_box_city,x='debt', y='city',width=0.5,palette="Set3")
plt.show()
# Visualization 7:
plt.figure(figsize=(10,5))
sns.boxplot(data=df_box_city,x='bad_debt', y='city',width=0.5,palette="Set3")
plt.show()
# Visualization 8:
sns.histplot(train['hi_mean'], kde=True)
plt.title('Household income distribution chart')
plt.show()
# Visualization 9:
sns.histplot(train['family_mean'], kde=True)
plt.title('Family income distribution chart')
plt.show()
# Visualization 10:
sns.histplot(train['family_mean'] - train['hi_mean'], kde=True)
plt.title('Remaining income distribution chart')
plt.show()
# Visualization 11:
sns.histplot(train['pop'])
plt.title('Population distribution chart')
plt.show()
# Visualization 12:
sns.histplot(train['male_pop'])
plt.title('Male population distribution chart')
plt.show()
# Visualization 13:
sns.histplot(train['female_pop'])
plt.title('Female population distribution chart')
plt.show()
# Visualization 14:
sns.histplot(train['male_age_median'])
plt.title('Male age distribution chart')
plt.show()
# Visualization 15:
sns.histplot(train['female_age_median'])
plt.title('Female age distribution chart')
plt.show()
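# Population density = population divided by land area (the ALand field)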
train["pop_density"]=train["pop"]/train["ALand"]
test["pop_density"]=test["pop"]/test["ALand"]
# Visualization 16:
sns.histplot(train['pop_density'], kde=True)
plt.title('Population density distribution chart')
plt.show()
# Visualization 17:
sns.boxplot(x=train['pop_density'])
plt.title('Population density boxplot')
plt.show()
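# Approximate the overall median age as the average of the male and female median ages
# (not the true combined median, but a reasonable proxy for EDA)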
train["median_age"]=(train["male_age_median"]+train["female_age_median"])/2
test["median_age"]=(test["male_age_median"]+test["female_age_median"])/2
train[['male_age_median','female_age_median','male_pop','female_pop','median_age']].head()
# Visualization 18:
sns.histplot(train['median_age'])
plt.title('Age median distribution chart')
plt.show()
train["pop"].describe()
train['pop_bins']=pd.cut(train['pop'],bins=5,labels=['very low','low','medium','high','very high'])
train[['pop','pop_bins']]
train['pop_bins'].value_counts()
train.groupby(by='pop_bins')[['married','separated','divorced']].count()
train.groupby(by='pop_bins')[['married','separated','divorced']].agg(["mean", "median"])
# Visualization 19:
pop_bin_married = train.groupby(by='pop_bins')[['married','separated','divorced']].mean()
sns.lineplot(data=pop_bin_married)
plt.title('Mean marital-status rates by population bin')
plt.show()
rent_state_mean=train.groupby(by='state')['rent_mean'].agg(["mean"])
rent_state_mean.head()
income_state_mean=train.groupby(by='state')['family_mean'].agg(["mean"])
income_state_mean.head()
# State-level rent as a percentage of family income
rent_perc_of_income = rent_state_mean['mean'] / income_state_mean['mean']
rent_perc_of_income.head(10)
# Overall rent as a percentage of family income
sum(train['rent_mean']) / sum(train['family_mean'])
# Correlation analysis and heatmap (numeric columns only; 'type' is categorical, so it is excluded)
corr_cols = ["COUNTYID","STATEID","zip_code","pop","family_mean",'second_mortgage','home_equity','debt','hs_degree','median_age','pct_own','married','separated','divorced']
train[corr_cols].corr()
# Visualization 20:
plt.figure(figsize=(12, 8))
sns.heatmap(train[corr_cols].corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.show()
The economic data contains a large number of measured variables. The goal is to determine whether these measured variables depend on a smaller number of unobserved common factors, or latent variables. Each measured variable is assumed to be a linear combination of the common factors; the coefficients are known as loadings. Each measured variable also includes a component due to independent random variability, known as the "specific variance" because it is specific to one variable. We obtain the common factors with factor analysis and then plot the loadings (see the sketch after the fit below) to gain insight into the linear relationships in the data. The latent variables of interest are:
• High school graduation rates
• Median population age
• Second mortgage statistics
• Percent ownership
• Bad debt expense
from sklearn.decomposition import FactorAnalysis
# Fit a 5-factor model on the numeric columns (object and binned categorical columns are excluded)
fa = FactorAnalysis(n_components=5, random_state=11)
train_transformed = fa.fit_transform(train.select_dtypes(exclude=('object','category')))
train_transformed.shape
train_transformed
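The write-up above calls for plotting the loadings. The fitted model exposes them as fa.components_, with one row of loadings per factor; a minimal sketch of that plot, assuming the fit above has already run:
# Loadings plot sketch: rows are factors, columns are the numeric features used in the fit
numeric_cols = train.select_dtypes(exclude=('object','category')).columns
loadings = pd.DataFrame(fa.components_,
                        index=[f'Factor {i+1}' for i in range(fa.n_components)],
                        columns=numeric_cols)
plt.figure(figsize=(16, 4))
sns.heatmap(loadings, cmap='coolwarm', center=0)
plt.title('Factor loadings')
plt.show()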
# Reload the raw data for modelling; this time drop rows with missing values instead of imputing
x_train = pd.read_csv('train.csv')
x_test = pd.read_csv('test.csv')
x_train.drop(['BLOCKID','SUMLEVEL'], axis=1, inplace=True)
x_train.dropna(axis=0, inplace=True)
x_train.head()
x_train.drop_duplicates(inplace=True)
x_train.shape
x_test.head()
x_test.shape
x_test.drop(['BLOCKID','SUMLEVEL'],axis=1,inplace=True)
x_test.isna().sum()
x_test.dropna(axis=0,inplace=True)
x_test.drop_duplicates(inplace=True)
x_test.shape
imp_feature = x_train.select_dtypes(exclude=('object','category'))
imp_feature.head()
imp_feature.shape
# Drop identifier columns that carry no predictive signal
to_drop = ['UID','COUNTYID', 'STATEID', 'zip_code', 'area_code', 'lat', 'lng']
imp_feature = imp_feature.drop(columns=[c for c in to_drop if c in imp_feature.columns])
imp_feature.head()
x_train_features = imp_feature[['pop','rent_median','hi_median','family_median','hc_mean','second_mortgage','home_equity','debt','hs_degree','pct_own','married','separated','divorced']]
x_train_features.head()
x_train_features.shape
y_train = imp_feature['hc_mortgage_mean']
x_test_feature = x_test[['pop','rent_median','hi_median','family_median','hc_mean','second_mortgage','home_equity','debt','hs_degree','pct_own','married','separated','divorced']]
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x_train_features, y_train)
y_pred = lr.predict(x_test_feature)
y_test = x_test['hc_mortgage_mean']
from sklearn.metrics import r2_score, mean_squared_error
# R-squared on the held-out test set
r2_score(y_test, y_pred)
# Root mean squared error, in the units of hc_mortgage_mean
np.sqrt(mean_squared_error(y_test, y_pred))
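A single train/test split can flatter a model. As a quick sanity check (an addition, not part of the original pipeline), 5-fold cross-validated R-squared on the same training features:
# Cross-validation sketch; assumes x_train_features and y_train from the cells above
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(LinearRegression(), x_train_features, y_train,
                            cv=5, scoring='r2')
print(cv_scores, cv_scores.mean())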
# Visualization 21:
sns.histplot(y_pred, kde=True)
plt.title('Predicted hc_mortgage_mean distribution')
plt.show()